This is me fiddling around with an employee attrition dataset on my last few days at PNC.
Uncover the factors that lead to employee attrition and explore important questions such as ‘show me a breakdown of distance from home by job role and attrition’ or ‘compare average monthly income by education and attrition’. This is a fictional data set created by IBM data scientists.
Education 1 ‘Below College’ 2 ‘College’ 3 ‘Bachelor’ 4 ‘Master’ 5 ‘Doctor’
EnvironmentSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
JobInvolvement 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
JobSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
PerformanceRating 1 ‘Low’ 2 ‘Good’ 3 ‘Excellent’ 4 ‘Outstanding’
RelationshipSatisfaction 1 ‘Low’ 2 ‘Medium’ 3 ‘High’ 4 ‘Very High’
WorkLifeBalance 1 ‘Bad’ 2 ‘Good’ 3 ‘Better’ 4 ‘Best’
library(tidyverse)
library(plotly)
library(readr)
library(devtools)
library(caret)
df <- read_csv(file='ibm_attrition_file.csv')
my_PNC_theme <- theme_bw()
my_PNC_colors = c("#0452cf", "#2c67c7", "#5a83c4", "#b0b0ac", "#e8a061", "#e88631", "#e86e05")
my_PNC_colors_BIG = c("#0080FF","#006DD9", "#005AB2", "#2693FF", "#4CA6FF", "#73B9FF", "#b0b0ac", "#FFB973","#FFA54C","#FF9226", "#D96C00", "#8C4600", "#B25900", "#FF7F00")
head(df)
## # A tibble: 6 x 35
## Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 41 Yes Travel_Rarely 1102 Sales 1 2
## 2 49 No Travel_Freque… 279 Research … 8 1
## 3 37 Yes Travel_Rarely 1373 Research … 2 2
## 4 33 No Travel_Freque… 1392 Research … 3 4
## 5 27 No Travel_Rarely 591 Research … 2 1
## 6 32 No Travel_Freque… 1005 Research … 2 2
## # … with 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## # EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## # HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## # JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## # MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## # PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## # RelationshipSatisfaction <dbl>, StandardHours <dbl>,
## # StockOptionLevel <dbl>, TotalWorkingYears <dbl>,
## # TrainingTimesLastYear <dbl>, WorkLifeBalance <dbl>, YearsAtCompany <dbl>,
## # YearsInCurrentRole <dbl>, YearsSinceLastPromotion <dbl>,
## # YearsWithCurrManager <dbl>
nrow(df)
## [1] 1470
plot_ly(alpha = .7) %>%
add_histogram(
x = ~df$Attrition,
name = 'Employee Attrition'
) %>%
layout(
title = "~15% employee churn in dataset",
xaxis = list(
title = 'Attrition',
zeroline = FALSE
),
yaxis = list(
title = 'Count',
zeroline = FALSE
)
)
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
age_dist_plot <- ggplot(data = df, mapping = aes(x = Age)) +
geom_histogram(fill = my_PNC_colors[2],
line = list(color = "darkgray",
width = 5),bins = 30
) +
labs(title = "Employees distributed by age",
x = "Age",
y = "Count") +
geom_vline(xintercept = median(df$Age), col = my_PNC_colors[6]) +
geom_text(x = median(df$Age), y = 4000, label = "50%", col = my_PNC_colors[6], size = 3) +
geom_vline(xintercept = quantile(df$Age, .25), col = my_PNC_colors[6], linetype = "dashed") +
geom_text(x = quantile(df$Age, .25), y = -20, label = "25%", col = my_PNC_colors[6], size = 2.5) +
geom_vline(xintercept = quantile(df$Age, .75), col = my_PNC_colors[6], linetype = "dashed") +
geom_text(x = quantile(df$Age, .75), y = -20, label = "75%", col = my_PNC_colors[6], size = 2.5) +
my_PNC_theme
## Warning: Ignoring unknown parameters: line
ggplotly(age_dist_plot)
Attrition rates are higher among younger employees
df_attr <- df %>% filter(Attrition=='Yes')
p_age_dist <- plot_ly(alpha = .7) %>%
add_histogram(x = ~df$Age,
name = "All Employees",
bingroup = 1) %>%
add_histogram(x = ~df_attr$Age,
name = "Attrited Employees",
bingroup = 1) %>%
layout(barmode = "overlay",
title = "Employee Attrition Distributed by Age",
xaxis = list(title = "Age",
zeroline = FALSE),
yaxis = list(title = "Count",
zeroline = FALSE))
p_age_dist
High Job Satisfaction still yields attrition but at a lower frequency.
fig <- plot_ly(
type='histogram',
x=as.character(df$JobSatisfaction),
bingroup=1)
fig <- fig %>% add_trace(
type='histogram',
x=as.character(df[which(df$Attrition=='Yes'),]$JobSatisfaction),
bingroup=1)
fig <- fig %>% layout(
barmode="overlay",
bargap=0.1)
fig
Single men and women are more likely to leave the company but age may be a confounder. Men are slightly more likely to leave than women.
df %>%
group_by(Gender, MaritalStatus) %>%
summarize(attr_pct = sum(ifelse(Attrition=='Yes', 1, 0))/n()) -> df_marital
## `summarise()` regrouping output by 'Gender' (override with `.groups` argument)
plot_ly(
x = df_marital$Gender,
y = df_marital$MaritalStatus,
z = df_marital$attr_pct,
type='heatmap'
)
library(heatmaply)
## Loading required package: viridis
## Loading required package: viridisLite
##
## ======================
## Welcome to heatmaply version 1.1.1
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## Or contact: <tal.galili@gmail.com>
## ======================
library(ggcorrplot)
num_x <- dplyr::select_if(df, is.numeric) %>%
select(-c(EmployeeCount, EmployeeNumber, StandardHours))
heatmaply_cor(
cor(num_x),
xlab = "Features",
ylab = "Features",
k_col = 2,
k_row = 2
)
## Warning in doTryCatch(return(expr), name, parentenv, handler): unable to load shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so':
## dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
## Referenced from: /Library/Frameworks/R.framework/Versions/3.6/Resources/modules/R_X11.so
## Reason: image not found
names(df)
## [1] "Age" "Attrition"
## [3] "BusinessTravel" "DailyRate"
## [5] "Department" "DistanceFromHome"
## [7] "Education" "EducationField"
## [9] "EmployeeCount" "EmployeeNumber"
## [11] "EnvironmentSatisfaction" "Gender"
## [13] "HourlyRate" "JobInvolvement"
## [15] "JobLevel" "JobRole"
## [17] "JobSatisfaction" "MaritalStatus"
## [19] "MonthlyIncome" "MonthlyRate"
## [21] "NumCompaniesWorked" "Over18"
## [23] "OverTime" "PercentSalaryHike"
## [25] "PerformanceRating" "RelationshipSatisfaction"
## [27] "StandardHours" "StockOptionLevel"
## [29] "TotalWorkingYears" "TrainingTimesLastYear"
## [31] "WorkLifeBalance" "YearsAtCompany"
## [33] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [35] "YearsWithCurrManager"
head(df)
## # A tibble: 6 x 35
## Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 41 Yes Travel_Rarely 1102 Sales 1 2
## 2 49 No Travel_Freque… 279 Research … 8 1
## 3 37 Yes Travel_Rarely 1373 Research … 2 2
## 4 33 No Travel_Freque… 1392 Research … 3 4
## 5 27 No Travel_Rarely 591 Research … 2 1
## 6 32 No Travel_Freque… 1005 Research … 2 2
## # … with 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## # EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## # HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## # JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## # MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## # PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## # RelationshipSatisfaction <dbl>, StandardHours <dbl>,
## # StockOptionLevel <dbl>, TotalWorkingYears <dbl>,
## # TrainingTimesLastYear <dbl>, WorkLifeBalance <dbl>, YearsAtCompany <dbl>,
## # YearsInCurrentRole <dbl>, YearsSinceLastPromotion <dbl>,
## # YearsWithCurrManager <dbl>
library(dplyr)
df %>%
dplyr::select(Attrition, Age, BusinessTravel, DailyRate, Department, DistanceFromHome, Education, EnvironmentSatisfaction,
Gender, HourlyRate, JobInvolvement,JobLevel, JobRole, JobSatisfaction, MaritalStatus, MonthlyIncome, MonthlyRate,
NumCompaniesWorked, OverTime, PercentSalaryHike, PerformanceRating, RelationshipSatisfaction, StockOptionLevel,
TotalWorkingYears, TrainingTimesLastYear, WorkLifeBalance, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion,
YearsWithCurrManager) -> df_pred
## 75% of the sample size
smp_size <- floor(0.75 * nrow(df_pred))
## set the seed to make your partition reproducible
set.seed(123)
train_ind <- sample(seq_len(nrow(df_pred)), size = smp_size)
train <- df_pred[train_ind, ]
test <- df_pred[-train_ind, ]
test %>% group_by(Attrition) %>% tally()
## # A tibble: 2 x 2
## Attrition n
## <chr> <int>
## 1 No 302
## 2 Yes 66
train %>% group_by(Attrition) %>% tally()
## # A tibble: 2 x 2
## Attrition n
## <chr> <int>
## 1 No 931
## 2 Yes 171
Linear Discriminant Analysis (LDA) is a feature reduction method for data with discrete classes. It is like PCA except that it takes advantage of information about the classification in the training data. It projects data into fewer dimensions by maximizing both the mean distance between the median data point of each class and minimizing the “spread” within each class.
library(MASS)
library(caret)
# Estimate preprocessing parameters
preproc.param <- train %>%
preProcess(method = c("center", "scale"))
# Transform the data using the estimated parameters
train.transformed <- preproc.param %>% predict(train)
test.transformed <- preproc.param %>% predict(test)
fit <- lda(Attrition ~ ., data=train.transformed)
# Make predictions
predictions <- fit %>% predict(test.transformed)
# Model accuracy
mean(predictions$class==test.transformed$Attrition)
## [1] 0.8695652
fit
## Call:
## lda(Attrition ~ ., data = train.transformed)
##
## Prior probabilities of groups:
## No Yes
## 0.8448276 0.1551724
##
## Group means:
## Age BusinessTravelTravel_Frequently BusinessTravelTravel_Rarely
## No 0.08403305 0.1729323 0.7142857
## Yes -0.45751327 0.2982456 0.6491228
## DailyRate DepartmentResearch & Development DepartmentSales
## No 0.02116198 0.6745435 0.2857143
## Yes -0.11521522 0.5730994 0.3742690
## DistanceFromHome Education EnvironmentSatisfaction GenderMale
## No -0.0229024 0.01866583 0.04521538 0.6165414
## Yes 0.1246908 -0.10162506 -0.24617262 0.6140351
## HourlyRate JobInvolvement JobLevel JobRoleHuman Resources
## No 0.004268794 0.05758122 0.07923603 0.02900107
## Yes -0.023241212 -0.31349774 -0.43139617 0.05263158
## JobRoleLaboratory Technician JobRoleManager JobRoleManufacturing Director
## No 0.1578947 0.08485499 0.11278195
## Yes 0.2631579 0.01754386 0.05847953
## JobRoleResearch Director JobRoleResearch Scientist JobRoleSales Executive
## No 0.065520945 0.1965628 0.2126745
## Yes 0.005847953 0.1988304 0.2105263
## JobRoleSales Representative JobSatisfaction MaritalStatusMarried
## No 0.04189044 0.04304638 0.4908700
## Yes 0.15204678 -0.23436360 0.3567251
## MaritalStatusSingle MonthlyIncome MonthlyRate NumCompaniesWorked
## No 0.2717508 0.07725473 0.004449091 -0.008251001
## Yes 0.5087719 -0.42060909 -0.024222828 0.044922115
## OverTimeYes PercentSalaryHike PerformanceRating RelationshipSatisfaction
## No 0.2352309 0.009277256 0.0009095309 0.01043468
## Yes 0.5146199 -0.050509507 -0.0049518905 -0.05681105
## StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance
## No 0.06013557 0.07774277 0.0242523 0.02494059
## Yes -0.32740475 -0.42326618 -0.1320403 -0.13578767
## YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion
## No 0.05923639 0.06950496 0.02213622
## Yes -0.32250923 -0.37841588 -0.12051942
## YearsWithCurrManager
## No 0.06144507
## Yes -0.33453425
##
## Coefficients of linear discriminants:
## LD1
## Age -0.31382997
## BusinessTravelTravel_Frequently 0.85006785
## BusinessTravelTravel_Rarely 0.37271972
## DailyRate -0.06225762
## DepartmentResearch & Development 0.16675536
## DepartmentSales 0.39728225
## DistanceFromHome 0.14502871
## Education 0.02819707
## EnvironmentSatisfaction -0.26383015
## GenderMale 0.07614074
## HourlyRate -0.02626801
## JobInvolvement -0.28249441
## JobLevel 0.01286254
## JobRoleHuman Resources 1.01170545
## JobRoleLaboratory Technician 0.77204358
## JobRoleManager 0.16090277
## JobRoleManufacturing Director 0.16169254
## JobRoleResearch Director 0.03828262
## JobRoleResearch Scientist 0.19140001
## JobRoleSales Executive 0.22250585
## JobRoleSales Representative 1.38783855
## JobSatisfaction -0.26522562
## MaritalStatusMarried 0.06232714
## MaritalStatusSingle 0.68210944
## MonthlyIncome -0.08575937
## MonthlyRate -0.01138975
## NumCompaniesWorked 0.21414690
## OverTimeYes 1.21323465
## PercentSalaryHike -0.10151441
## PerformanceRating 0.07169970
## RelationshipSatisfaction -0.11233641
## StockOptionLevel -0.09672352
## TotalWorkingYears -0.03993764
## TrainingTimesLastYear -0.13114119
## WorkLifeBalance -0.14639612
## YearsAtCompany 0.12803471
## YearsInCurrentRole -0.21860306
## YearsSinceLastPromotion 0.19916812
## YearsWithCurrManager -0.12850848
ct <- table(test.transformed$Attrition, predictions$class)
ct
##
## No Yes
## No 294 8
## Yes 40 26
diag(prop.table(ct, 1))
## No Yes
## 0.9735099 0.3939394
# # total percent correct
# Predicted classes
head(predictions$class, 6)
## [1] Yes No Yes No Yes No
## Levels: No Yes
# Predicted probabilities of class memebership.
head(predictions$posterior, 6)
## No Yes
## 1 0.4137657 0.586234260
## 2 0.9555013 0.044498702
## 3 0.1516319 0.848368093
## 4 0.9911454 0.008854612
## 5 0.2984373 0.701562672
## 6 0.9537101 0.046289939
# Linear discriminants
head(predictions$x, 3)
## LD1
## 1 1.8708056
## 2 -0.3830858
## 3 2.7772201
lda.data <- cbind(train.transformed, predict(fit)$x)
plot(fit)
ggplot(lda.data, aes(LD1, y=LD1+rnorm(nrow(lda.data)))) +
geom_point(aes(color = Attrition))